df <- read.csv("data/GermanCredit.csv", header=TRUE, sep=";")
# Conversion des données catégorielles en factor
df$CHK_ACCT <- as.factor(df$CHK_ACCT)
df$HISTORY <- as.factor(df$HISTORY)
df$SAV_ACCT <- as.factor(df$SAV_ACCT)
df$EMPLOYMENT <- as.factor(df$EMPLOYMENT)
df$PRESENT_RESIDENT <- as.factor(df$PRESENT_RESIDENT)
df$JOB <- as.factor(df$JOB)
#df<- na.omit(df) # Omission des données manquantesProjet VID
Données sur les crédits Allemands
Introduction
Dans ce projet il nous est demandé de trouver les variables permettant d’obtenir le meilleur modèle de régression linéaire multiple pour déterminer si c’est une bonne idée de faire un crédit bancaire à une personne.
Analyse exploratoire des données
Vérification des données
library(summarytools)
dfSummary(df, style="grid")Data Frame Summary
df
Dimensions: 1000 x 32
Duplicates: 0
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing |
+====+==================+=============================+======================+=======================+==========+=========+
| 1 | OBS. | Mean (sd) : 500.5 (288.8) | 1000 distinct values | : : : : : : : : : : | 1000 | 0 |
| | [integer] | min < med < max: | (Integer sequence) | : : : : : : : : : : | (100.0%) | (0.0%) |
| | | 1 < 500.5 < 1000 | | : : : : : : : : : : | | |
| | | IQR (CV) : 499.5 (0.6) | | : : : : : : : : : : | | |
| | | | | : : : : : : : : : : | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 2 | CHK_ACCT | 1. 0 | 274 (27.4%) | IIIII | 1000 | 0 |
| | [factor] | 2. 1 | 269 (26.9%) | IIIII | (100.0%) | (0.0%) |
| | | 3. 2 | 63 ( 6.3%) | I | | |
| | | 4. 3 | 394 (39.4%) | IIIIIII | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 3 | DURATION | Mean (sd) : 20.9 (12.1) | 34 distinct values | : | 1000 | 0 |
| | [integer] | min < med < max: | | : . | (100.0%) | (0.0%) |
| | | -6 < 18 < 72 | | : : | | |
| | | IQR (CV) : 12 (0.6) | | : : : | | |
| | | | | : : : : : | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 4 | HISTORY | 1. 0 | 40 ( 4.0%) | | 1000 | 0 |
| | [factor] | 2. 1 | 49 ( 4.9%) | | (100.0%) | (0.0%) |
| | | 3. 2 | 530 (53.0%) | IIIIIIIIII | | |
| | | 4. 3 | 88 ( 8.8%) | I | | |
| | | 5. 4 | 293 (29.3%) | IIIII | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 5 | NEW_CAR | Min : 0 | 0 : 766 (76.6%) | IIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.2 | 1 : 234 (23.4%) | IIII | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 6 | USED_CAR | Min : 0 | 0 : 897 (89.7%) | IIIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.1 | 1 : 103 (10.3%) | II | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 7 | FURNITURE | Min : 0 | 0 : 819 (81.9%) | IIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.2 | 1 : 181 (18.1%) | III | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 8 | RADIO.TV | Min : 0 | 0 : 720 (72.0%) | IIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.3 | 1 : 280 (28.0%) | IIIII | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 9 | EDUCATION | Min : 0 | 0 : 950 (95.0%) | IIIIIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0 | 1 : 50 ( 5.0%) | I | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 10 | RETRAINING | Min : 0 | 0 : 903 (90.3%) | IIIIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.1 | 1 : 97 ( 9.7%) | I | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 11 | AMOUNT | Mean (sd) : 3271.3 (2822.7) | 921 distinct values | : | 1000 | 0 |
| | [integer] | min < med < max: | | : . | (100.0%) | (0.0%) |
| | | 250 < 2319.5 < 18424 | | : : | | |
| | | IQR (CV) : 2606.8 (0.9) | | : : | | |
| | | | | : : : : . | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 12 | SAV_ACCT | 1. 0 | 603 (60.3%) | IIIIIIIIIIII | 1000 | 0 |
| | [factor] | 2. 1 | 103 (10.3%) | II | (100.0%) | (0.0%) |
| | | 3. 2 | 63 ( 6.3%) | I | | |
| | | 4. 3 | 48 ( 4.8%) | | | |
| | | 5. 4 | 183 (18.3%) | III | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 13 | EMPLOYMENT | 1. 0 | 62 ( 6.2%) | I | 1000 | 0 |
| | [factor] | 2. 1 | 172 (17.2%) | III | (100.0%) | (0.0%) |
| | | 3. 2 | 339 (33.9%) | IIIIII | | |
| | | 4. 3 | 174 (17.4%) | III | | |
| | | 5. 4 | 253 (25.3%) | IIIII | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 14 | INSTALL_RATE | Mean (sd) : 3 (1.1) | 1 : 136 (13.6%) | II | 1000 | 0 |
| | [integer] | min < med < max: | 2 : 231 (23.1%) | IIII | (100.0%) | (0.0%) |
| | | 1 < 3 < 4 | 3 : 157 (15.7%) | III | | |
| | | IQR (CV) : 2 (0.4) | 4 : 476 (47.6%) | IIIIIIIII | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 15 | MALE_DIV | Min : 0 | 0 : 950 (95.0%) | IIIIIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0 | 1 : 50 ( 5.0%) | I | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 16 | MALE_SINGLE | Mean (sd) : 0.5 (0.5) | 0 : 452 (45.2%) | IIIIIIIII | 1000 | 0 |
| | [integer] | min < med < max: | 1 : 547 (54.7%) | IIIIIIIIII | (100.0%) | (0.0%) |
| | | 0 < 1 < 2 | 2 : 1 ( 0.1%) | | | |
| | | IQR (CV) : 1 (0.9) | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 17 | MALE_MAR_or_WID | Min : 0 | 0 : 908 (90.8%) | IIIIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.1 | 1 : 92 ( 9.2%) | I | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 18 | CO.APPLICANT | Min : 0 | 0 : 959 (95.9%) | IIIIIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0 | 1 : 41 ( 4.1%) | | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 19 | GUARANTOR | Mean (sd) : 0.1 (0.2) | -1 : 1 ( 0.1%) | | 1000 | 0 |
| | [integer] | min < med < max: | 0 : 947 (94.7%) | IIIIIIIIIIIIIIIIII | (100.0%) | (0.0%) |
| | | -1 < 0 < 1 | 1 : 52 ( 5.2%) | I | | |
| | | IQR (CV) : 0 (4.4) | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 20 | PRESENT_RESIDENT | 1. 1 | 130 (13.0%) | II | 1000 | 0 |
| | [factor] | 2. 2 | 308 (30.8%) | IIIIII | (100.0%) | (0.0%) |
| | | 3. 3 | 149 (14.9%) | II | | |
| | | 4. 4 | 413 (41.3%) | IIIIIIII | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 21 | REAL_ESTATE | Min : 0 | 0 : 718 (71.8%) | IIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.3 | 1 : 282 (28.2%) | IIIII | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 22 | PROP_UNKN_NONE | Min : 0 | 0 : 846 (84.6%) | IIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.2 | 1 : 154 (15.4%) | III | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 23 | AGE | Mean (sd) : 35.5 (11.8) | 54 distinct values | : | 986 | 14 |
| | [integer] | min < med < max: | | : . | (98.6%) | (1.4%) |
| | | 19 < 33 < 151 | | : : | | |
| | | IQR (CV) : 15 (0.3) | | : : . | | |
| | | | | : : : . | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 24 | OTHER_INSTALL | Min : 0 | 0 : 814 (81.4%) | IIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.2 | 1 : 186 (18.6%) | III | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 25 | RENT | Min : 0 | 0 : 821 (82.1%) | IIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.2 | 1 : 179 (17.9%) | III | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 26 | OWN_RES | Min : 0 | 0 : 287 (28.7%) | IIIII | 1000 | 0 |
| | [integer] | Mean : 0.7 | 1 : 713 (71.3%) | IIIIIIIIIIIIII | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 27 | NUM_CREDITS | Mean (sd) : 1.4 (0.6) | 1 : 633 (63.3%) | IIIIIIIIIIII | 1000 | 0 |
| | [integer] | min < med < max: | 2 : 333 (33.3%) | IIIIII | (100.0%) | (0.0%) |
| | | 1 < 1 < 4 | 3 : 28 ( 2.8%) | | | |
| | | IQR (CV) : 1 (0.4) | 4 : 6 ( 0.6%) | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 28 | JOB | 1. 0 | 22 ( 2.2%) | | 1000 | 0 |
| | [factor] | 2. 1 | 200 (20.0%) | IIII | (100.0%) | (0.0%) |
| | | 3. 2 | 630 (63.0%) | IIIIIIIIIIII | | |
| | | 4. 3 | 148 (14.8%) | II | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 29 | NUM_DEPENDENTS | Min : 1 | 1 : 845 (84.5%) | IIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 1.2 | 2 : 155 (15.5%) | III | (100.0%) | (0.0%) |
| | | Max : 2 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 30 | TELEPHONE | Min : 0 | 0 : 596 (59.6%) | IIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0.4 | 1 : 404 (40.4%) | IIIIIIII | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 31 | FOREIGN | Min : 0 | 0 : 963 (96.3%) | IIIIIIIIIIIIIIIIIII | 1000 | 0 |
| | [integer] | Mean : 0 | 1 : 37 ( 3.7%) | | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
| 32 | RESPONSE | Min : 0 | 0 : 300 (30.0%) | IIIIII | 1000 | 0 |
| | [integer] | Mean : 0.7 | 1 : 700 (70.0%) | IIIIIIIIIIIIII | (100.0%) | (0.0%) |
| | | Max : 1 | | | | |
+----+------------------+-----------------------------+----------------------+-----------------------+----------+---------+
Voici les données observées que ne jouent pas avec la donnée: valeur min DURATION -6?, MALE_SINGLE une valeur à 2, GUARANTOR -1, PRESENT_RESIDENT valeurs de 1 à 4 alors que la données dit valeurs 0 à 3, valeur max AGE 151. Pour AGE il y a 14 valeurs manquantes.
Voici les corrections validées par le client:
df<- na.omit(df)
library(ggplot2)
ggplot(df, aes(x=df$CHK_ACCT)) + geom_bar()ggplot(df, aes(x=df$SAV_ACCT)) + geom_bar()ggplot(df, aes(x=df$AGE)) + geom_bar()TODO trouver comment faire une boucle ou un truc du genre pour afficher un graphique par colonne.
Premier modèle de régression multiple
df.lm <- lm(formula = RESPONSE ~ . - OBS., data = df)
summary(df.lm)
Call:
lm(formula = RESPONSE ~ . - OBS., data = df)
Residuals:
Min 1Q Median 3Q Max
-1.06285 -0.30776 0.08616 0.28136 0.90095
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.732e-01 1.771e-01 4.367 1.4e-05 ***
CHK_ACCT1 8.250e-02 3.694e-02 2.233 0.025765 *
CHK_ACCT2 1.926e-01 5.817e-02 3.311 0.000964 ***
CHK_ACCT3 2.830e-01 3.487e-02 8.117 1.5e-15 ***
DURATION -4.514e-03 1.517e-03 -2.976 0.002991 **
HISTORY1 4.700e-03 9.001e-02 0.052 0.958366
HISTORY2 1.455e-01 7.084e-02 2.054 0.040292 *
HISTORY3 1.976e-01 7.799e-02 2.534 0.011445 *
HISTORY4 2.608e-01 7.164e-02 3.641 0.000287 ***
NEW_CAR -1.179e-01 6.094e-02 -1.935 0.053339 .
USED_CAR 9.934e-02 6.995e-02 1.420 0.155902
FURNITURE 3.491e-03 6.363e-02 0.055 0.956266
RADIO.TV 1.124e-02 6.069e-02 0.185 0.853160
EDUCATION -1.433e-01 8.013e-02 -1.788 0.074069 .
RETRAINING -1.494e-02 7.012e-02 -0.213 0.831349
AMOUNT -1.556e-05 7.239e-06 -2.150 0.031807 *
SAV_ACCT1 4.518e-02 4.442e-02 1.017 0.309438
SAV_ACCT2 8.090e-02 5.471e-02 1.479 0.139578
SAV_ACCT3 1.569e-01 6.209e-02 2.528 0.011637 *
SAV_ACCT4 1.205e-01 3.576e-02 3.369 0.000785 ***
EMPLOYMENT1 1.725e-03 7.011e-02 0.025 0.980373
EMPLOYMENT2 5.776e-02 6.688e-02 0.864 0.387988
EMPLOYMENT3 1.303e-01 6.999e-02 1.861 0.062989 .
EMPLOYMENT4 6.912e-02 6.676e-02 1.035 0.300754
INSTALL_RATE -4.407e-02 1.326e-02 -3.324 0.000923 ***
MALE_DIV -6.800e-02 6.319e-02 -1.076 0.282187
MALE_SINGLE 7.602e-02 3.208e-02 2.370 0.017987 *
MALE_MAR_or_WID 2.539e-02 4.953e-02 0.513 0.608342
CO.APPLICANT -6.803e-02 6.574e-02 -1.035 0.301023
GUARANTOR 1.770e-01 5.928e-02 2.986 0.002896 **
PRESENT_RESIDENT2 -1.237e-01 4.438e-02 -2.787 0.005419 **
PRESENT_RESIDENT3 -6.385e-02 4.994e-02 -1.279 0.201341
PRESENT_RESIDENT4 -5.693e-02 4.497e-02 -1.266 0.205859
REAL_ESTATE 3.227e-02 3.189e-02 1.012 0.311889
PROP_UNKN_NONE -8.947e-02 5.976e-02 -1.497 0.134657
AGE 1.653e-03 1.298e-03 1.273 0.203350
OTHER_INSTALL -8.211e-02 3.489e-02 -2.353 0.018820 *
RENT -1.062e-01 7.303e-02 -1.454 0.146390
OWN_RES -3.417e-02 7.024e-02 -0.486 0.626785
NUM_CREDITS -3.908e-02 2.803e-02 -1.394 0.163515
JOB1 -8.530e-02 1.033e-01 -0.825 0.409399
JOB2 -9.774e-02 1.006e-01 -0.971 0.331615
JOB3 -7.416e-02 1.023e-01 -0.725 0.468507
NUM_DEPENDENTS -3.964e-02 3.820e-02 -1.038 0.299742
TELEPHONE 5.015e-02 2.968e-02 1.690 0.091422 .
FOREIGN 1.622e-01 7.035e-02 2.306 0.021323 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3972 on 940 degrees of freedom
Multiple R-squared: 0.2853, Adjusted R-squared: 0.2511
F-statistic: 8.34 on 45 and 940 DF, p-value: < 2.2e-16
library(ggResidpanel)
resid_interact(df.lm, plots = c("resid", "qq", "cookd", "boxplot"))Recherche du meilleur modèle
```{r}
library(leaps)
Best_Subset <- regsubsets(RESPONSE~., data = df, nbest = 1, nvmax = NULL, force.in = NULL, force.out = "OBS.", method = "exhaustive")
summary_best_subset <- summary(regsubsets.out)
as.data.frame(summary_best_subset$outmat)
which.max(summary_best_subset$adjr2)
summary_best_subset$which[13,]
``````{r}
library(leaps)
models <- regsubsets(RESPONSE ~ ., data=df, nvmax=NULL, force.out = "OBS.")
summary(models)
res.sum <- summary(models)
data.frame(
Adj.R2 = which.max(res.sum$adjr2),
CP = which.min(res.sum$cp),
BIC = which.min(res.sum$bic)
)
``````{r}
get_model_formula <- function(id, object, outcome){
# get models data
models <- summary(object)$which[id,-1]
# Get outcome variable
form <- as.formula(object$call[[2]])
outcome <- all.vars(form)[1]
# Get model predictors
predictors <- names(which(models == TRUE))
predictors <- paste(predictors, collapse = "+")
# Build model formula
as.formula(paste0(outcome, "~", predictors))
}
#lm10 <- lm(get_model_formula(10, models, "RESPONSE"), data=df)
#summary(lm10)
#lm18 <- lm(get_model_formula(18, models, "RESPONSE"), data=df)
#summary(lm18)
#lm23 <- lm(get_model_formula(23, models, "RESPONSE"), data=df)
#summary(lm23)
```